In smoking_status Unknown should be changed to NA.
Also, it can be ordered: never < formerly < smokes
ever_married can be recoded as 0/1 in accordance with
heart_disease and hypertension
Other predictors seem to be OK
# Load the stroke data set. col_types "cfdfffffddcf" keeps smoking_status as
# character so the na = c("Unknown", "N/A") substitution can act on it; the
# column is turned into a releveled factor right after reading.
df <- read_csv("data/healthcare-dataset-stroke-data.csv",
               col_types = "cfdfffffddcf",
               na = c("Unknown", "N/A"))
# if you set smoking_status to factor in col_types, na() won't work
df <- df %>%
  mutate(
    # order the smoking levels: never < formerly < smokes
    smoking_status = fct_relevel(as_factor(smoking_status),
                                 c("never smoked", "formerly smoked", "smokes")),
    # married: recode Yes/No as a 0/1 factor, matching hypertension/heart_disease
    ever_married = factor(if_else(ever_married == "Yes", 1, 0)),
    # caret needs syntactically valid class labels, so 1/0 becomes yes/no
    stroke = factor(ifelse(stroke == 1, "yes", "no"), levels = c("no", "yes"))
  )
df
Skip the id column
df$id <- NULL
skimr::skim(df)| Name | df |
| Number of rows | 5110 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| factor | 8 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| gender | 0 | 1.0 | FALSE | 3 | Fem: 2994, Mal: 2115, Oth: 1 |
| hypertension | 0 | 1.0 | FALSE | 2 | 0: 4612, 1: 498 |
| heart_disease | 0 | 1.0 | FALSE | 2 | 0: 4834, 1: 276 |
| ever_married | 0 | 1.0 | FALSE | 2 | 1: 3353, 0: 1757 |
| work_type | 0 | 1.0 | FALSE | 5 | Pri: 2925, Sel: 819, chi: 687, Gov: 657 |
| Residence_type | 0 | 1.0 | FALSE | 2 | Urb: 2596, Rur: 2514 |
| smoking_status | 1544 | 0.7 | FALSE | 3 | nev: 1892, for: 885, smo: 789 |
| stroke | 0 | 1.0 | FALSE | 2 | no: 4861, yes: 249 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1.00 | 43.23 | 22.61 | 0.08 | 25.00 | 45.00 | 61.00 | 82.00 | ▅▆▇▇▆ |
| avg_glucose_level | 0 | 1.00 | 106.15 | 45.28 | 55.12 | 77.24 | 91.88 | 114.09 | 271.74 | ▇▃▁▁▁ |
| bmi | 201 | 0.96 | 28.89 | 7.85 | 10.30 | 23.50 | 28.10 | 33.10 | 97.60 | ▇▇▁▁▁ |
Target ‘stroke’ is imbalanced!
‘smoking_status’ completeness rate 0.7
df %>% group_by(stroke, smoking_status) %>% summarise(N=n())BMI’s complete rate 0.96
df %>% filter(is.na(bmi)) %>% group_by(stroke) %>% summarise(N=n())One ‘Other’ gender to be removed
df <- df %>% filter(gender != "Other")GGally::ggpairs(df, aes(color = stroke, alpha = 0.2, dotsize = 0.02),
upper = list(continuous = GGally::wrap("cor", size = 2.5)),
diag = list(continuous = "barDiag")) +
scale_color_brewer(palette = "Set1", direction = -1) +
scale_fill_brewer(palette = "Set1", direction = -1)ggplot(df, aes(stroke, age)) +
geom_boxplot(aes(fill = stroke), alpha = 0.5, varwidth = T, notch = T) +
geom_violin(aes(fill = stroke), alpha = 0.5) +
scale_fill_brewer(palette = "Set1", direction = -1) +
xlab("")
OBS! There are observations with age much below 20 y.o., even close to 0!
These are very young kids or babies - should we even include them in the analysis?
If not, it will be a prediction model for adults only. Also, stroke in kids probably has very different causes compared to stroke in adults/old folk.
ggplot(df, aes(stroke, age)) +
geom_violin(alpha=0.3) +
geom_jitter(alpha=0.2, size=0.8, width = 0.15, height = 0.1, aes(color = gender)) +
geom_boxplot(alpha = 0.2) +
scale_color_brewer(palette = "Set2", direction = -1)ggplot(df, aes(stroke, avg_glucose_level)) +
geom_boxplot(aes(fill = stroke), alpha = 0.5, varwidth = T, notch = T) +
geom_violin(aes(fill = stroke), alpha = 0.5) +
scale_fill_brewer(palette = "Set1", direction = -1) +
xlab("")
This average glucose level is probably the result of a fasting blood sugar test.
If I correctly understand this CDC information on diabetes, values greater than 126 are evidence of diabetes. But 250? Is it realistic?
ggplot(df, aes(stroke, bmi)) +
geom_boxplot(aes(fill = stroke), alpha = 0.5, varwidth = T, notch = T) +
geom_violin(aes(fill = stroke), alpha = 0.5) +
scale_fill_brewer(palette = "Set1", direction = -1) +
xlab("")
BMI over 40 is the 3rd class of obesity - BMI over 75 should not exist at all.
Let’s look at these weird points
ggplot(df, aes(age, bmi)) +
geom_point(aes(color = stroke), alpha = 0.8, size = 0.5) +
scale_fill_brewer(palette = "Set1", direction = -1) +
facet_grid(rows = vars(stroke)) +
guides(color = "none")Patients with BMI over 75 are also very young. Suspicious.
ggplot(df, aes(age, avg_glucose_level)) +
geom_point(aes(color = smoking_status), alpha = 0.6, size = 1) +
scale_fill_brewer(palette = "Set1", direction = -1) +
facet_grid(rows = vars(stroke)) +
guides()OBS! Kids are mainly ‘Unknown’ smoking status; both target groups are divided into two clusters – I’m curious why.
It is not gender, nor heart disease or any other factor we have in the data set.
ggplot(df, aes(smoking_status, age)) +
geom_boxplot(aes(fill = stroke), alpha = 0.5, varwidth = T, notch = T) +
scale_fill_brewer(palette = "Set1", direction = -1) +
xlab("")Kids are mainly non-smokers of course. Note the same two outliers of age below 20 in stroke-yes
ggplot(df, aes(avg_glucose_level, bmi)) +
geom_point(aes(color = age), alpha = 0.6, size = 1) +
scale_fill_brewer(palette = "Set1", direction = -1) +
facet_grid(rows = vars(stroke)) +
guides()BMI outliers: super high BMI but super low glucose levels? How’s that possible?
Can it be a bias introduced by testing protocol misuse? Like instead of measuring glucose before eating, in some samples it was done after eating?
gender <- df %>% group_by(stroke, gender) %>% summarize(N=n())
ggplot(gender, aes(stroke, N)) +
geom_bar(aes(fill=gender), alpha = 0.8, stat = "identity", position = "fill") +
scale_fill_brewer(palette = "Set2", direction = -1) +
ylab("proportion")Proportions in both stroke groups are roughly the same
hyptens <- df %>% group_by(stroke, hypertension) %>% summarize(N = n())
ggplot(hyptens, aes(stroke, N)) +
geom_bar(aes(fill = hypertension), alpha = 0.8, stat = "identity", position = "fill") +
scale_fill_brewer(palette = "Set2", direction = -1) +
ylab("proportion")Hypertension occurred more often in stroke-yes
heart <- df %>% group_by(stroke, heart_disease) %>% summarize(N=n())
ggplot(heart, aes(stroke, N)) +
geom_bar(aes(fill = heart_disease), alpha = 0.8, stat = "identity", position = "fill") +
scale_fill_brewer(palette = "Set2", direction = 1) +
ylab("proportion")married <- df %>% group_by(stroke, ever_married) %>% summarize(N=n())
ggplot(married, aes(stroke, N)) +
geom_bar(aes(fill = ever_married), alpha = 0.8, stat = "identity", position = "fill") +
scale_fill_brewer(palette = "Set2", direction = -1) +
ylab("proportion")Marriage is bad :)
work <- df %>% group_by(stroke, work_type) %>% summarize(N=n())
ggplot(work, aes(stroke, N)) +
geom_bar(aes(fill = work_type), alpha = 0.8, stat = "identity", position = "fill") +
scale_fill_brewer(palette = "Set2", direction = 1) +
ylab("proportion")It’s good to be a child
residence <- df %>% group_by(stroke, Residence_type) %>% summarize(N=n())
ggplot(residence, aes(stroke, N)) +
geom_bar(aes(fill = Residence_type), alpha = 0.8, stat = "identity", position = "fill") +
scale_fill_brewer(palette = "Set2", direction = 1) +
ylab("proportion")smoking <- df %>% group_by(stroke, smoking_status) %>% summarize(N=n())
ggplot(smoking, aes(stroke, N)) +
geom_bar(aes(fill = smoking_status), alpha = 0.8, stat = "identity", position = "fill") +
scale_fill_brewer(palette = "Set2", direction = 1) +
ylab("proportion")There are several suspicious outliers (like in BMI and glucose). I can safely remove BMI > 75, maybe even BMI > 60 (Remember that BMI > 40 is the highest class of obesity).
What is less safe is removing non-adults (younger than 20 y.o.). On one hand they provide valid information (age is a very important predictor); on the other hand their stroke cases are really suspicious and a lot of predictors do not make sense (or are obvious NAs) for non-adults (like smoking, marriage status, employment type, residence type etc.); model-based imputation of smoking_status in children doesn’t make sense either — I should rather replace it with “never smoked”.
Since, modelling using all predictors and observations has given me very moderate results (TPR = 1 comes with very high FPR and very low probability cutoff close to 0), I will try various trimming of the data.
Try the ‘no kids’ version
df_trim <- df %>% filter(bmi < 60, age > 20 )
skimr::skim(df_trim)| Name | df_trim |
| Number of rows | 3896 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| factor | 8 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| gender | 0 | 1.00 | FALSE | 2 | Fem: 2383, Mal: 1513, Oth: 0 |
| hypertension | 0 | 1.00 | FALSE | 2 | 0: 3451, 1: 445 |
| heart_disease | 0 | 1.00 | FALSE | 2 | 0: 3654, 1: 242 |
| ever_married | 0 | 1.00 | FALSE | 2 | 1: 3186, 0: 710 |
| work_type | 0 | 1.00 | FALSE | 4 | Pri: 2521, Sel: 757, Gov: 617, Nev: 1 |
| Residence_type | 0 | 1.00 | FALSE | 2 | Urb: 1988, Rur: 1908 |
| smoking_status | 752 | 0.81 | FALSE | 3 | nev: 1630, for: 807, smo: 707 |
| stroke | 0 | 1.00 | FALSE | 2 | no: 3688, yes: 208 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 51.23 | 16.97 | 21.00 | 38.00 | 51.00 | 64.00 | 82.00 | ▆▇▇▆▆ |
| avg_glucose_level | 0 | 1 | 108.16 | 47.60 | 55.12 | 77.22 | 92.21 | 115.98 | 271.74 | ▇▃▁▁▁ |
| bmi | 0 | 1 | 30.49 | 6.93 | 11.30 | 25.60 | 29.40 | 34.30 | 59.70 | ▁▇▅▁▁ |
BMI is complete; in total approx. 1200 observations are gone (5110 → 3896)
Using package mice
It uses polr - proportional odds model - for
smoking_status and pmm - predictive mean
matching - for bmi
library(mice)
imp_mice <- mice(df_trim)##
## iter imp variable
## 1 1 smoking_status
## 1 2 smoking_status
## 1 3 smoking_status
## 1 4 smoking_status
## 1 5 smoking_status
## 2 1 smoking_status
## 2 2 smoking_status
## 2 3 smoking_status
## 2 4 smoking_status
## 2 5 smoking_status
## 3 1 smoking_status
## 3 2 smoking_status
## 3 3 smoking_status
## 3 4 smoking_status
## 3 5 smoking_status
## 4 1 smoking_status
## 4 2 smoking_status
## 4 3 smoking_status
## 4 4 smoking_status
## 4 5 smoking_status
## 5 1 smoking_status
## 5 2 smoking_status
## 5 3 smoking_status
## 5 4 smoking_status
## 5 5 smoking_status
df_imp <- complete(imp_mice)Number of NAs in BMI: 0
Number of NAs in Smoking: 0
bmi_imp_comp <- bind_rows(select(df_trim, bmi, stroke) %>% mutate(type = rep("original", nrow(df_trim))),
select(df_imp, bmi, stroke) %>% mutate(type = rep("imputed", nrow(df_imp))))
ggplot(bmi_imp_comp, aes(bmi)) +
geom_histogram(aes(fill = type), alpha = 0.8) +
facet_grid(cols = vars(stroke))Means have not changed, which is good, I suppose.
smoke_imp_comp <- bind_rows(select(df_trim, smoking_status, stroke) %>% mutate(type = rep("original", nrow(df_trim))),
select(df_imp, smoking_status, stroke) %>% mutate(type = rep("imputed", nrow(df_imp))))
ggplot(smoke_imp_comp, aes(smoking_status)) +
geom_bar(aes(fill=type), alpha=0.8, position="dodge") +
facet_grid(cols = vars(stroke)) +
xlab("")+
theme(axis.text.x = element_text(angle=45, vjust = 0.5))Counts increased proportionally in all Smoking groups
Scale numeric features (including imputed BMI)
# use caret::preProcess()
# preProcValues <- preProcess(training, method = c("center", "scale"))
df_scaled <- df_imp %>%
select(avg_glucose_level, age, bmi) %>%
scale() %>%
data.frame()I’ve decided to omit smoking_status completely - it won’t be dummified
# select vars
to_dum <- df_imp %>% select(gender, work_type, Residence_type, smoking_status)
# make an obj
dummies <- dummyVars(~ ., data = to_dum)
# apply it
df_dummy <- data.frame(predict(dummies, newdata = to_dum))
head(df_dummy)df_proc <- bind_cols(df_scaled, df_dummy, select(df_trim, hypertension, heart_disease, ever_married, stroke))
head(df_proc)ROC-optimization is better when data is imbalanced
Kappa-optimization is also good
# for ROC
# Resampling setups for caret::train().
# ROC optimization needs class probabilities and the twoClassSummary function.
fit_ctrl_roc <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
repeats = 10,
allowParallel = TRUE,  # TRUE instead of T: T/F can be reassigned and silently break this
classProbs = TRUE,
summaryFunction = twoClassSummary)
# for kappa (caret's default summary reports Accuracy and Kappa)
fit_ctrl_kp <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
repeats = 10,
allowParallel = TRUE)
fit_ctrl_kp10 <- trainControl(## 10-fold CV
method = "repeatedcv",
number = 10,
repeats = 10,
allowParallel = T)Imbalanced data - use SMOTE to create training data set, but not testing data set
# Reproducible 75/25 stratified train/test split on the target.
set.seed(1234)
sample_set <- createDataPartition(y = df_proc$stroke, p = .75, list = FALSE)
df_train <- df_proc[sample_set,]
df_test <- df_proc[-sample_set,]
# DMwR::SMOTE rebalances the TRAINING set only; the test set stays untouched.
# NOTE(review): the call uses perc.over = 1725 and perc.under = 106 (the old
# comment said 225/150, which did not match the code); these values were tuned
# to give roughly a 1:1 class ratio in df_train_smote.
df_train_smote <- SMOTE(stroke ~ ., data.frame(df_train), perc.over = 1725, perc.under = 106)
df_train_smote %>% group_by(stroke) %>% summarise(N=n())# start a cluster
library(doParallel)
cl <- makePSOCKcluster(THREADS)
registerDoParallel(cl)For imbalanced classes
# Random forest on the SMOTE-balanced training data, tuned over mtry and
# selected by Kappa (5-fold CV repeated 10 times via fit_ctrl_kp).
# NOTE(review): non-integer mtry values (2.5, 3.5, ...) are presumably
# truncated by randomForest internally -- TODO confirm they differ at all
# from the neighboring integer values.
set.seed(123)
fit_rf <- train(stroke ~ .,
data = df_train_smote,
metric = "Kappa",
method = "rf",
trControl = fit_ctrl_kp,
tuneGrid = expand.grid(.mtry = seq(2, 6, 0.5)), # I've tried all values greater than these
verbosity = 0,
verbose = FALSE)
fit_rf## Random Forest
##
## 5619 samples
## 19 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 4495, 4495, 4495, 4495, 4496, 4494, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2.0 0.9608825 0.9217635
## 2.5 0.9605622 0.9211227
## 3.0 0.9656877 0.9313737
## 3.5 0.9675566 0.9351114
## 4.0 0.9674139 0.9348261
## 4.5 0.9671473 0.9342928
## 5.0 0.9681794 0.9363572
## 5.5 0.9683219 0.9366422
## 6.0 0.9682685 0.9365354
##
## Kappa was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.5.
#cl <- makePSOCKcluster(THREADS)
#registerDoParallel(cl)
# Same random-forest grid as fit_rf, but selected by ROC-AUC instead of Kappa
# (fit_ctrl_roc supplies classProbs + twoClassSummary, which ROC requires).
set.seed(120)
fit_rf_roc <- train(stroke ~ .,
data = df_train_smote,
metric = "ROC",
method = "rf",
trControl = fit_ctrl_roc,
tuneGrid = expand.grid(.mtry = seq(2, 6, 0.5)),
verbosity = 0,
verbose = FALSE)
#stopCluster(cl)
fit_rf_roc## Random Forest
##
## 5619 samples
## 19 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 4495, 4496, 4495, 4496, 4494, 4496, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2.0 0.9871208 0.9795448 0.9412761
## 2.5 0.9869408 0.9798643 0.9411336
## 3.0 0.9894277 0.9890067 0.9412411
## 3.5 0.9905384 0.9911413 0.9431288
## 4.0 0.9904763 0.9910703 0.9425950
## 4.5 0.9905240 0.9912481 0.9427729
## 5.0 0.9910410 0.9913194 0.9441621
## 5.5 0.9912119 0.9903944 0.9454441
## 6.0 0.9911809 0.9905368 0.9449454
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.5.
imp_vars_rf <- varImp(fit_rf)
plot(imp_vars_rf, main = "Variable Importance with RF")it’s the same
a function for roc-stuff
# Build ROC objects for a fitted caret model evaluated on a test set.
#
# fit.obj    -- a caret::train fit that supports type = "prob" prediction
# testing.df -- data frame holding the predictors and a 'stroke' factor column
#
# Returns a list whose first element is the ROCR tpr/fpr performance object
# (for plotting the ROC curve) and whose second is the ROCR prediction object
# (for AUC and other performance measures).
get_roc <- function(fit.obj, testing.df){
  pred_prob <- predict.train(fit.obj, newdata = testing.df, type = "prob")
  pred_roc <- prediction(predictions = pred_prob$yes, labels = testing.df$stroke)
  perf_roc <- performance(pred_roc, measure = "tpr", x.measure = "fpr")
  # Named elements for readability; positional [[1]]/[[2]] access still works.
  list(perf = perf_roc, pred = pred_roc)
}
# calculate ROC
perf_pred <- get_roc(fit_rf, df_test)
perf_rf <- perf_pred[[1]]
pred_rf <- perf_pred[[2]]
# take AUC
auc_rf <- round(unlist(slot(performance(pred_rf, measure = "auc"), "y.values")), 3)
# plot
plot(perf_rf, main = "RF-k ROC curve", col = "steelblue", lwd = 3)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
legend(x = 0.7, y = 0.3, legend = paste0("AUC = ", auc_rf))# calculate ROC
perf_pred_roc <- get_roc(fit_rf_roc, df_test)
perf_rf_roc <- perf_pred_roc[[1]]
pred_rf_roc <- perf_pred_roc[[2]]
# take AUC
auc_rf_roc <- round(unlist(slot(performance(pred_rf_roc, measure = "auc"), "y.values")), 3)
# plot
plot(perf_rf_roc, main = "RF-r ROC curve", col = "steelblue", lwd = 3)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
legend(x = 0.7, y = 0.3, legend = paste0("AUC = ", auc_rf_roc))So, we can adjust TPR/FPR cutoff to predict all strokes
At which probability cut-off, you’ll get TPR = 1.0?
# use pred_rf (pred_roc) object
plot(performance(pred_rf, measure = "tpr", x.measure = "cutoff"),
col="steelblue",
ylab = "Rate",
xlab="Probability cutoff")
plot(performance(pred_rf, measure = "fpr", x.measure = "cutoff"),
add = T, col = "red")
legend(x = 0.6,y = 0.7, c("TPR (Recall)", "FPR (1-Spec)"),
lty = 1, col =c('steelblue', 'red'), bty = 'n', cex = 1, lwd = 2)
#abline(v = 0.02, lwd = 2, lty=6)
title("RF-k")Vertical line at cutoff = 0.02 designates maximum TPR and maximum FPR. Ideal cutoff should be to the left of this line
# use pred_rf (pred_roc) object
plot(performance(pred_rf_roc, measure = "tpr", x.measure = "cutoff"),
col = "steelblue",
ylab = "Rate",
xlab = "Probability cutoff")
plot(performance(pred_rf_roc, measure = "fpr", x.measure = "cutoff"),
add = T, col = "red")
legend(x = 0.6,y = 0.7, c("TPR (Recall)", "FPR (1-Spec)"),
lty = 1, col = c('steelblue', 'red'), bty = 'n', cex = 1, lwd = 2)
#abline(v = 0.02, lwd = 2, lty=6)
title("RF-r")Vertical line at 0.02
Using desired cut-off: we want to maximize TPR (Sensitivity, Recall)!
According to the TPR/FPR plot (above) the optimal cutoff is
# predict probabilities for the positive class on the held-out test set
pred_prob_rf <- predict(fit_rf, newdata = df_test, type = "prob")
# choose your cut-off (read off the TPR/FPR-vs-cutoff plot above)
cutoff <- 0.01  # '<-' instead of '=' for assignment
# turn probabilities into classes; pin the factor levels to the reference's so
# confusionMatrix() does not fail if every prediction lands in a single class
pred_class_rf <- ifelse(pred_prob_rf$yes > cutoff, "yes", "no")
pred_class_rf <- factor(pred_class_rf, levels = levels(df_test$stroke))
cm_rf <- confusionMatrix(data = pred_class_rf,
                         reference = df_test$stroke,
                         mode = "everything",
                         positive = "yes")
cm_rf## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 295 2
## yes 627 50
##
## Accuracy : 0.3542
## 95% CI : (0.3241, 0.3852)
## No Information Rate : 0.9466
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0422
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.96154
## Specificity : 0.31996
## Pos Pred Value : 0.07386
## Neg Pred Value : 0.99327
## Precision : 0.07386
## Recall : 0.96154
## F1 : 0.13717
## Prevalence : 0.05339
## Detection Rate : 0.05133
## Detection Prevalence : 0.69507
## Balanced Accuracy : 0.64075
##
## 'Positive' Class : yes
##
# predict probabilities for the ROC-tuned random forest on the test set
pred_prob_rf_roc <- predict(fit_rf_roc, newdata = df_test, type = "prob")
# choose your cut-off
cutoff = 0.01
# turn probabilities into classes
pred_class_rf_roc <- ifelse(pred_prob_rf_roc$yes > cutoff, "yes", "no")
pred_class_rf_roc <- as.factor(pred_class_rf_roc)
# NOTE(review): this overwrites cm_rf computed for the Kappa-tuned model above;
# storing the result as cm_rf_roc would keep both confusion matrices available.
cm_rf <- confusionMatrix(data = pred_class_rf_roc,
reference = df_test$stroke,
mode = "everything",
positive = "yes")
cm_rf## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 295 2
## yes 627 50
##
## Accuracy : 0.3542
## 95% CI : (0.3241, 0.3852)
## No Information Rate : 0.9466
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0422
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.96154
## Specificity : 0.31996
## Pos Pred Value : 0.07386
## Neg Pred Value : 0.99327
## Precision : 0.07386
## Recall : 0.96154
## F1 : 0.13717
## Prevalence : 0.05339
## Detection Rate : 0.05133
## Detection Prevalence : 0.69507
## Balanced Accuracy : 0.64075
##
## 'Positive' Class : yes
##
10-fold CV
# AdaBoost.M1 on the SMOTE-balanced training set, selected by Kappa with
# 10-fold CV repeated 10 times (fit_ctrl_kp10).
set.seed(122)
#cl <- makePSOCKcluster(THREADS)
#registerDoParallel(cl)
fit_adb <- train(stroke ~ .,
data = df_train_smote,
metric = "Kappa",
method = "AdaBoost.M1",
trControl = fit_ctrl_kp10,
tuneGrid = expand.grid(
.maxdepth = seq(10, 20, 2),
.mfinal = seq(150, 180, 10),
.coeflearn = c("Freund")),
verbosity = 0,
verbose = FALSE)
# coeflearn=Freund was chosen by automatic grid search, mfinal choice comes from there too
# stop cluster started earlier for parallel resampling
stopCluster(cl)
fit_adb## AdaBoost.M1
##
## 5619 samples
## 19 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 5057, 5057, 5056, 5057, 5057, 5057, ...
## Resampling results across tuning parameters:
##
## maxdepth mfinal Accuracy Kappa
## 10 150 0.9690510 0.9381009
## 10 160 0.9689264 0.9378516
## 10 170 0.9690332 0.9380653
## 10 180 0.9688020 0.9376028
## 12 150 0.9688196 0.9376381
## 12 160 0.9688729 0.9377447
## 12 170 0.9687481 0.9374950
## 12 180 0.9687839 0.9375666
## 14 150 0.9689262 0.9378512
## 14 160 0.9691400 0.9382789
## 14 170 0.9690687 0.9381362
## 14 180 0.9690867 0.9381722
## 16 150 0.9689977 0.9379943
## 16 160 0.9689440 0.9378869
## 16 170 0.9688195 0.9376379
## 16 180 0.9689975 0.9379938
## 18 150 0.9692650 0.9385287
## 18 160 0.9692470 0.9384928
## 18 170 0.9693538 0.9387064
## 18 180 0.9693004 0.9385996
## 20 150 0.9695314 0.9390616
## 20 160 0.9694779 0.9389546
## 20 170 0.9693534 0.9387055
## 20 180 0.9692821 0.9385631
##
## Tuning parameter 'coeflearn' was held constant at a value of Freund
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were mfinal = 150, maxdepth = 20
## and coeflearn = Freund.
# calculate ROC
perf_pred_adb <- get_roc(fit_adb, df_test)
perf_adb <- perf_pred_adb[[1]]
pred_adb <- perf_pred_adb[[2]]
# take AUC
auc_adb <- round(unlist(slot(performance(pred_adb, measure = "auc"), "y.values")), 3)
# plot
plot(perf_adb, main = "AdaBoost ROC curve", col = "steelblue", lwd = 3)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
legend(x = 0.7, y = 0.3, legend = paste0("AUC = ", auc_adb))At which probability cut-off, you’ll get TPR = 1.0?
# use pred_rf (pred_roc) object
plot(performance(pred_adb, measure = "tpr", x.measure = "cutoff"),
col="steelblue",
ylab = "Rate",
xlab="Probability cutoff")
plot(performance(pred_adb, measure = "fpr", x.measure = "cutoff"),
add = T, col = "red")
legend(x = 0.6,y = 0.7, c("TPR (Recall)", "FPR (1-Spec)"),
lty = 1, col =c('steelblue', 'red'), bty = 'n', cex = 1, lwd = 2)
#abline(v = 0.1, lwd = 2, lty=6)
title("AdaBoost.M1")pred_prob_adb <- predict(fit_adb, newdata = df_test, type = "prob")
# choose your cut-off (read off the TPR/FPR-vs-cutoff plot above)
cutoff <- 0.12  # '<-' instead of '=' for assignment
# turn probabilities into classes; pin the factor levels to the reference's so
# confusionMatrix() does not fail if every prediction lands in a single class
pred_class_adb <- ifelse(pred_prob_adb$yes > cutoff, "yes", "no")
pred_class_adb <- factor(pred_class_adb, levels = levels(df_test$stroke))
cm_adb <- confusionMatrix(data = pred_class_adb,
                          reference = df_test$stroke,
                          mode = "everything",
                          positive = "yes")
cm_adb## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 231 0
## yes 691 52
##
## Accuracy : 0.2906
## 95% CI : (0.2622, 0.3202)
## No Information Rate : 0.9466
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0345
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.00000
## Specificity : 0.25054
## Pos Pred Value : 0.06999
## Neg Pred Value : 1.00000
## Precision : 0.06999
## Recall : 1.00000
## F1 : 0.13082
## Prevalence : 0.05339
## Detection Rate : 0.05339
## Detection Prevalence : 0.76283
## Balanced Accuracy : 0.62527
##
## 'Positive' Class : yes
##
xgbTree has 7 parameters
10-fold CV
# xgbTree on the SMOTE-balanced training set, selected by Kappa with
# 10-fold CV repeated 10 times. Of xgbTree's 7 tuning parameters, only
# max_depth is searched here; the other six are held fixed.
set.seed(121)
fit_xgb_kp <- train(stroke ~ .,
data = df_train_smote,
method = "xgbTree",
metric = "Kappa",
trControl = fit_ctrl_kp10,
tuneGrid = expand.grid(
.nrounds = 100,
.max_depth = seq(3, 15, 1),
.eta = 0.3,
.gamma = 0.01,
.colsample_bytree = 1,
.min_child_weight = 1,
.subsample = 1
),
verbose = FALSE,
verbosity = 0)
fit_xgb_kp$bestTuneimp_vars_xgb <- varImp(fit_xgb_kp)
plot(imp_vars_xgb, main = "Variable Importance with XGB")# calculate ROC
perf_pred_xgb <- get_roc(fit_xgb_kp, df_test)
perf_xgb <- perf_pred_xgb[[1]]
pred_xgb <- perf_pred_xgb[[2]]
# take AUC
auc_xgb <- round(unlist(slot(performance(pred_xgb, measure = "auc"), "y.values")), 3)
# plot
plot(perf_xgb, main = "XGB ROC curve", col = "steelblue", lwd = 3)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
legend(x = 0.7, y = 0.3, legend = paste0("AUC = ", auc_xgb))# use pred_xgb object
plot(performance(pred_xgb, measure = "tpr", x.measure = "cutoff"),
col = "steelblue",
ylab = "Rate",
xlab = "Probability cutoff")
plot(performance(pred_xgb, measure = "fpr", x.measure = "cutoff"),
add = T, col = "red")
legend(x = 0.6,y = 0.7, c("TPR (Recall)", "FPR (1-Spec)"),
lty = 1, col = c('steelblue', 'red'), bty = 'n', cex = 1, lwd = 2)
#abline(v = 0.1, lwd = 2, lty=6)
title("xgbTree")pred_prob_xgb <- predict(fit_xgb_kp, newdata=df_test, type = "prob")
# choose your cut-off (read off the TPR/FPR-vs-cutoff plot above)
cutoff <- 0.12  # '<-' instead of '=' for assignment
# turn probabilities into classes; pin the factor levels to the reference's so
# confusionMatrix() does not fail if every prediction lands in a single class
pred_class_xgb <- ifelse(pred_prob_xgb$yes > cutoff, "yes", "no")
pred_class_xgb <- factor(pred_class_xgb, levels = levels(df_test$stroke))
cm_xgb <- confusionMatrix(data = pred_class_xgb,
                          reference = df_test$stroke,
                          mode = "everything",
                          positive = "yes")
cm_xgbsave.image("data/workspace.RData")